In [ ]:
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627

# BAGGING, BOOSTING, and RANDOM FORESTS

# Import necessary libraries
! pip install pandas;
! pip install numpy;
! pip install scikit-learn;
! pip install matplotlib;
! pip install ISLP;

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt;
from ISLP import load_data

# Load the Auto dataset from package ISLP
from ISLP import load_data
Auto = load_data('Auto')  
In [12]:
print(Auto.columns)
Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'year', 'origin'],
      dtype='object')
In [10]:
# X = Auto.drop(columns=["mpg", "name"])
X = Auto.drop(columns=["mpg"])
y = Auto["mpg"]
In [14]:
# 1. Bagging and Random Forests
# Random forest model with default settings (similar to R's default m = p/3)
rf = RandomForestRegressor()
rf.fit(X, y)

# Model summary
print(f"Mean Squared Error: {mean_squared_error(y, rf.predict(X)):.4f}")
print(f"Variance explained (%): {rf.score(X, y) * 100:.2f}")
Mean Squared Error: 1.0002
Variance explained (%): 98.35
In [16]:
# Feature importance
importance = rf.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': importance}).sort_values(by='Importance', ascending=False)
print(importance_df)

# Plotting feature importance
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.barh(importance_df['Feature'], importance_df['Importance'])
plt.xlabel("Importance")
plt.title("Feature Importance in Random Forest")
plt.show()
        Feature  Importance
1  displacement    0.375632
3        weight    0.205921
0     cylinders    0.140268
2    horsepower    0.121348
5          year    0.120379
4  acceleration    0.030020
6        origin    0.006431
No description has been provided for this image
In [24]:
# 2. Cross-validation

# Reset indices for X and y
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

# Randomly split data into training and validation sets
Z = np.random.choice(len(y), 200, replace=False)

# Select training and test sets based on Z
X_train, X_test = X.loc[Z], X.drop(Z)
y_train, y_test = y.loc[Z], y.drop(Z)
In [26]:
# Fit random forest on training subset
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Calculate mean squared error
mse_cv = mean_squared_error(y_test, y_pred)
print(f"Cross-validated MSE: {mse_cv:.4f}")
Cross-validated MSE: 6.1369
In [40]:
# 3. Searching for the optimal solution

reg_tree = RandomForestRegressor(oob_score=True, n_estimators=147, max_features=3, random_state=42)
reg_tree.fit(X_train, y_train)

# Access the OOB score
print("OOB Score:", reg_tree.oob_score_)

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

plt.figure(figsize=(30, 20))
plot_tree(reg_tree.estimators_[0], feature_names=X_train.columns, filled=True, fontsize=10)
plt.show()
OOB Score: 0.828643068370655
No description has been provided for this image
In [42]:
# Sample sizes for trees
n_trees = range(1, 300, 10)  # You can adjust this range as needed
oob_scores = []

# Collect OOB scores for different numbers of trees
for n in n_trees:
    reg_tree = RandomForestRegressor(n_estimators=n, oob_score=True, random_state=42)
    reg_tree.fit(X_train, y_train)
    oob_scores.append(reg_tree.oob_score_)

# Plotting OOB error
plt.figure(figsize=(10, 6))
plt.plot(n_trees, oob_scores, marker='o', linestyle='-', color='b')
plt.title('OOB Score vs. Number of Trees')
plt.xlabel('Number of Trees')
plt.ylabel('OOB Score')
plt.grid()
plt.show()
C:\Users\baron\AppData\Local\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:615: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable OOB estimates.
  warn(
No description has been provided for this image
In [48]:
# Finding the optimal number of trees to minimize MSE and maximize R²
errors = reg_tree.oob_score_
optimal_trees = np.argmin(errors) + 1
print(f"Optimal number of trees: {optimal_trees}")

# Final random forest with optimal trees
rf_optimal = RandomForestRegressor(max_features=7, n_estimators=optimal_trees)
rf_optimal.fit(X, y)

print(f"Optimal Model MSE: {mean_squared_error(y, rf_optimal.predict(X)):.4f}")
print(f"Optimal Model R²: {rf_optimal.score(X, y) * 100:.2f}")
Optimal number of trees: 1
Optimal Model MSE: 5.9720
Optimal Model R²: 90.17
In [46]:
# 4. Cross-validation loop to find the best m and number of trees
cv_errors = []
n_trees = []

for m in range(1, 8):
    rf_m = RandomForestRegressor(max_features=m, oob_score=True)
    rf_m.fit(X_train, y_train)
    mse_min = np.min(rf_m.oob_score_)
    opt_trees = np.argmin(rf_m.oob_score_) + 1
    cv_errors.append(mse_min)
    n_trees.append(opt_trees)

# Plot CV errors
plt.figure(figsize=(10, 6))
plt.plot(range(1, 8), cv_errors, marker='o')
plt.xlabel("Number of Features (m)")
plt.ylabel("Cross-Validation Error (MSE)")
plt.title("Cross-Validation Error for Different m Values")
plt.show()

# Print results
optimal_m = np.argmin(cv_errors) + 1
print(f"Optimal m: {optimal_m}, with MSE: {cv_errors[optimal_m-1]:.4f}")

# Optimal random forest with best m and number of trees
rf_final = RandomForestRegressor(max_features=optimal_m, n_estimators=n_trees[optimal_m-1])
rf_final.fit(X, y)

print(f"Final Optimal Model MSE: {mean_squared_error(y, rf_final.predict(X)):.4f}")
print(f"Final Optimal Model R²: {rf_final.score(X, y) * 100:.2f}")

# Feature importance for the optimal model
importance_optimal = rf_final.feature_importances_
importance_df_optimal = pd.DataFrame({'Feature': features, 'Importance': importance_optimal}).sort_values(by='Importance', ascending=False)
print(importance_df_optimal)
No description has been provided for this image
Optimal m: 7, with MSE: 0.8105
Final Optimal Model MSE: 4.7235
Final Optimal Model R²: 92.23
        Feature  Importance
0     cylinders    0.651802
2    horsepower    0.130907
5          year    0.101565
3        weight    0.066039
4  acceleration    0.031475
1  displacement    0.015715
6        origin    0.002498